In [1]:
import pandas as pd
import numpy as np
In [2]:
def precission(y_hat, y_true):
    tp = np.sum(y_true[y_hat == 1])
    tot_p = np.sum(y_hat)
    
    return tp / tot_p

def recall(y_hat, y_true):
    tp = np.sum(y_true[y_hat == 1])
    fn = np.sum(y_true[y_hat==0])
    
    return tp / (tp + fn)
In [3]:
ds = pd.read_csv('./radar_points.txt')
ds.seq = ds.seq.astype(int)
ds.object_label = ds.object_label.astype(int)
ds.head()
Out[3]:
id stamp seq x y z probability relative_radial_velocity relative_lateral_velocity cross_section distance_rms angle_rms radial_velocity_rms is_cylindrical absolute_radial_velocity belongs_to_object object_label
0 0 1.523894e+09 52 476.396881 559.627075 0.3 0.999 0.000000 0.0 -9.0 0.0 0.0 0.0 0.0 0.000000 0.0 0
1 1 1.523894e+09 52 479.438354 566.931030 0.3 0.250 1.100242 0.0 -3.5 0.0 0.0 0.0 0.0 1.100242 0.0 0
2 2 1.523894e+09 52 480.062286 566.430298 0.3 0.750 1.488896 0.0 -12.5 0.0 0.0 0.0 0.0 1.488896 0.0 0
3 3 1.523894e+09 52 480.721039 568.209595 0.3 0.250 0.677458 0.0 -5.5 0.0 0.0 0.0 0.0 0.677458 0.0 0
4 4 1.523894e+09 52 481.002228 568.240417 0.3 0.750 0.225063 0.0 -9.5 0.0 0.0 0.0 0.0 0.225063 0.0 0

Что тут есть?

  • cross_section - можно думать, что это логарифм площади

  • relative_lateral_velocity - скорость относительно радара, радиальная ее часть

  • absolute_radial_velocity - скорость относительно мира

  • belongs_to_object - попала ли точка по нашему мнению на машину

Эвристика №1

Машины большие и едут быстро

In [4]:
y_hat = np.zeros(len(ds.values))
#dummy_mask = TODO
dummy_mask = ds.absolute_radial_velocity > 2
y_hat[dummy_mask] = 1
print('precission', precission(y_hat, ds.belongs_to_object.values))
print('recall', recall(y_hat, ds.belongs_to_object.values))
precission 0.3082339135536867
recall 0.11514575874917805

Эвристика №2

Машины отражают по несколько точек

In [5]:
dummy_cars = ds[dummy_mask][['x', 'y']].values

from sklearn.neighbors import NearestNeighbors

#TODO
index = NearestNeighbors()
index.fit(dummy_cars)
d, _ = index.kneighbors(ds[['x', 'y']], n_neighbors=2)

y_hat = np.zeros(len(ds.values))

y_hat[(d < 2.0)[:, 0]] = 1.0

print('precission', precission(y_hat, ds.belongs_to_object.values))
print('recall', recall(y_hat, ds.belongs_to_object.values))
precission 0.1570003605335897
recall 0.9544823555198363
In [6]:
import matplotlib.pyplot as plt
from IPython import display
import time
In [7]:
def visualize_ds(ds):
    ax = plt.gca()
    for seq in sorted(ds.seq.unique()):
        ax.clear()
        scene = ds[ds.seq == seq]
        ax.set_title(seq)
        plt.scatter(scene.x, scene.y, s=5, c=scene.relative_radial_velocity)
        ax.set_xlim((440,500))
        ax.set_ylim((552,582))
        display.clear_output(wait=True)
        display.display(plt.gcf())
        time.sleep(0.01)
    display.clear_output()
In [46]:
visualize_ds(ds)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-46-2c67130c87a9> in <module>
----> 1 visualize_ds(ds)

<ipython-input-45-a03f1c1dce4c> in visualize_ds(ds)
      9         ax.set_ylim((550,585))
     10         display.clear_output(wait=True)
---> 11         display.display(plt.gcf())
     12         time.sleep(0.01)
     13     display.clear_output()

~/anaconda3/lib/python3.7/site-packages/IPython/core/display.py in display(include, exclude, metadata, transient, display_id, *objs, **kwargs)
    304             publish_display_data(data=obj, metadata=metadata, **kwargs)
    305         else:
--> 306             format_dict, md_dict = format(obj, include=include, exclude=exclude)
    307             if not format_dict:
    308                 # nothing to display (e.g. _ipython_display_ took over)

~/anaconda3/lib/python3.7/site-packages/IPython/core/formatters.py in format(self, obj, include, exclude)
    178             md = None
    179             try:
--> 180                 data = formatter(obj)
    181             except:
    182                 # FIXME: log the exception

</Users/dskravchenko/anaconda3/lib/python3.7/site-packages/decorator.py:decorator-gen-9> in __call__(self, obj)

~/anaconda3/lib/python3.7/site-packages/IPython/core/formatters.py in catch_format_error(method, self, *args, **kwargs)
    222     """show traceback on failed format call"""
    223     try:
--> 224         r = method(self, *args, **kwargs)
    225     except NotImplementedError:
    226         # don't warn on NotImplementedErrors

~/anaconda3/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
    339                 pass
    340             else:
--> 341                 return printer(obj)
    342             # Finally look for special method names
    343             method = get_real_method(obj, self.print_method)

~/anaconda3/lib/python3.7/site-packages/IPython/core/pylabtools.py in <lambda>(fig)
    242 
    243     if 'png' in formats:
--> 244         png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))
    245     if 'retina' in formats or 'png2x' in formats:
    246         png_formatter.for_type(Figure, lambda fig: retina_figure(fig, **kwargs))

~/anaconda3/lib/python3.7/site-packages/IPython/core/pylabtools.py in print_figure(fig, fmt, bbox_inches, **kwargs)
    126 
    127     bytes_io = BytesIO()
--> 128     fig.canvas.print_figure(bytes_io, **kw)
    129     data = bytes_io.getvalue()
    130     if fmt == 'svg':

~/anaconda3/lib/python3.7/site-packages/matplotlib/backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, **kwargs)
   2080                     orientation=orientation,
   2081                     bbox_inches_restore=_bbox_inches_restore,
-> 2082                     **kwargs)
   2083             finally:
   2084                 if bbox_inches and restore_bbox:

~/anaconda3/lib/python3.7/site-packages/matplotlib/backends/backend_agg.py in print_png(self, filename_or_obj, metadata, pil_kwargs, *args, **kwargs)
    525 
    526         else:
--> 527             FigureCanvasAgg.draw(self)
    528             renderer = self.get_renderer()
    529             with cbook._setattr_cm(renderer, dpi=self.figure.dpi), \

~/anaconda3/lib/python3.7/site-packages/matplotlib/backends/backend_agg.py in draw(self)
    386         self.renderer = self.get_renderer(cleared=True)
    387         with RendererAgg.lock:
--> 388             self.figure.draw(self.renderer)
    389             # A GUI class may be need to update a window using this draw, so
    390             # don't forget to call the superclass.

~/anaconda3/lib/python3.7/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     36                 renderer.start_filter()
     37 
---> 38             return draw(artist, renderer, *args, **kwargs)
     39         finally:
     40             if artist.get_agg_filter() is not None:

~/anaconda3/lib/python3.7/site-packages/matplotlib/figure.py in draw(self, renderer)
   1707             self.patch.draw(renderer)
   1708             mimage._draw_list_compositing_images(
-> 1709                 renderer, self, artists, self.suppressComposite)
   1710 
   1711             renderer.close_group('figure')

~/anaconda3/lib/python3.7/site-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    133     if not_composite or not has_images:
    134         for a in artists:
--> 135             a.draw(renderer)
    136     else:
    137         # Composite any adjacent images together

~/anaconda3/lib/python3.7/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     36                 renderer.start_filter()
     37 
---> 38             return draw(artist, renderer, *args, **kwargs)
     39         finally:
     40             if artist.get_agg_filter() is not None:

~/anaconda3/lib/python3.7/site-packages/matplotlib/axes/_base.py in draw(self, renderer, inframe)
   2643             renderer.stop_rasterizing()
   2644 
-> 2645         mimage._draw_list_compositing_images(renderer, self, artists)
   2646 
   2647         renderer.close_group('axes')

~/anaconda3/lib/python3.7/site-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    133     if not_composite or not has_images:
    134         for a in artists:
--> 135             a.draw(renderer)
    136     else:
    137         # Composite any adjacent images together

~/anaconda3/lib/python3.7/site-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     36                 renderer.start_filter()
     37 
---> 38             return draw(artist, renderer, *args, **kwargs)
     39         finally:
     40             if artist.get_agg_filter() is not None:

~/anaconda3/lib/python3.7/site-packages/matplotlib/axis.py in draw(self, renderer, *args, **kwargs)
   1202         renderer.open_group(__name__)
   1203 
-> 1204         ticks_to_draw = self._update_ticks()
   1205         ticklabelBoxes, ticklabelBoxes2 = self._get_tick_bboxes(ticks_to_draw,
   1206                                                                 renderer)

~/anaconda3/lib/python3.7/site-packages/matplotlib/axis.py in _update_ticks(self)
   1078         the axes.  Return the list of ticks that will be drawn.
   1079         """
-> 1080         major_locs = self.get_majorticklocs()
   1081         major_labels = self.major.formatter.format_ticks(major_locs)
   1082         major_ticks = self.get_major_ticks(len(major_locs))

~/anaconda3/lib/python3.7/site-packages/matplotlib/axis.py in get_majorticklocs(self)
   1323     def get_majorticklocs(self):
   1324         """Get the array of major tick locations in data coordinates."""
-> 1325         return self.major.locator()
   1326 
   1327     def get_minorticklocs(self):

~/anaconda3/lib/python3.7/site-packages/matplotlib/ticker.py in __call__(self)
   2076     def __call__(self):
   2077         vmin, vmax = self.axis.get_view_interval()
-> 2078         return self.tick_values(vmin, vmax)
   2079 
   2080     def tick_values(self, vmin, vmax):

~/anaconda3/lib/python3.7/site-packages/matplotlib/ticker.py in tick_values(self, vmin, vmax)
   2084         vmin, vmax = mtransforms.nonsingular(
   2085             vmin, vmax, expander=1e-13, tiny=1e-14)
-> 2086         locs = self._raw_ticks(vmin, vmax)
   2087 
   2088         prune = self._prune

~/anaconda3/lib/python3.7/site-packages/matplotlib/ticker.py in _raw_ticks(self, vmin, vmax)
   2023         if self._nbins == 'auto':
   2024             if self.axis is not None:
-> 2025                 nbins = np.clip(self.axis.get_tick_space(),
   2026                                 max(1, self._min_n_ticks - 1), 9)
   2027             else:

~/anaconda3/lib/python3.7/site-packages/matplotlib/axis.py in get_tick_space(self)
   2173         ends = self.axes.transAxes.transform([[0, 0], [1, 0]])
   2174         length = ((ends[1][0] - ends[0][0]) / self.axes.figure.dpi) * 72
-> 2175         tick = self._get_tick(True)
   2176         # There is a heuristic here that the aspect ratio of tick text
   2177         # is no more than 3:1

~/anaconda3/lib/python3.7/site-packages/matplotlib/axis.py in _get_tick(self, major)
   1904         else:
   1905             tick_kw = self._minor_tick_kw
-> 1906         return XTick(self.axes, 0, '', major=major, **tick_kw)
   1907 
   1908     def _get_label(self):

~/anaconda3/lib/python3.7/site-packages/matplotlib/axis.py in __init__(self, axes, loc, label, size, width, color, tickdir, pad, labelsize, labelcolor, zorder, gridOn, tick1On, tick2On, label1On, label2On, major, labelrotation, grid_color, grid_linestyle, grid_linewidth, grid_alpha, **kw)
    159         self.tick2line = self._get_tick2line()
    160         self.gridline = self._get_gridline()
--> 161         self.label1 = self._get_text1()
    162         self.label2 = self._get_text2()
    163 

~/anaconda3/lib/python3.7/site-packages/matplotlib/axis.py in _get_text1(self)
    434             color=self._labelcolor,
    435             verticalalignment=vert,
--> 436             horizontalalignment=horiz,
    437             )
    438         t.set_transform(trans)

~/anaconda3/lib/python3.7/site-packages/matplotlib/text.py in __init__(self, x, y, text, color, verticalalignment, horizontalalignment, multialignment, fontproperties, rotation, linespacing, rotation_mode, usetex, wrap, **kwargs)
    165             linespacing = 1.2   # Maybe use rcParam later.
    166         self._linespacing = linespacing
--> 167         self.set_rotation_mode(rotation_mode)
    168         self.update(kwargs)
    169 

~/anaconda3/lib/python3.7/site-packages/matplotlib/text.py in set_rotation_mode(self, m)
    245             ``"anchor"``, then alignment occurs before rotation.
    246         """
--> 247         if m is None or m in ["anchor", "default"]:
    248             self._rotation_mode = m
    249         else:

KeyboardInterrupt: 

==== homework starts here ====

forming datasets

In [8]:
ds = ds.sort_values(by=['seq'])
seqs = ds.seq.unique()
nseqs = len(seqs)

train_part = 0.5
val_part = 0.3

train_end = int(train_part * nseqs)
val_end = train_end + int(val_part * nseqs)

train_seqs = seqs[:train_end]
val_seqs = seqs[train_end:val_end]
test_seqs = seqs[val_end:]
In [9]:
train_ds = ds[np.isin(ds['seq'], train_seqs)]
val_ds = ds[np.isin(ds['seq'], val_seqs)]
test_ds = ds[np.isin(ds['seq'], test_seqs)]
In [10]:
COL_DROPS = ['stamp', 'id', 'seq', 'object_label', 'belongs_to_object']

X_train = train_ds.drop(COL_DROPS, axis=1)
y_train = train_ds['belongs_to_object'].astype('int32')

X_val = val_ds.drop(COL_DROPS, axis=1)
y_val = val_ds['belongs_to_object'].astype('int32')

X_test = test_ds.drop(COL_DROPS, axis=1)
y_test = test_ds['belongs_to_object'].astype('int32')

X_train.head()
Out[10]:
x y z probability relative_radial_velocity relative_lateral_velocity cross_section distance_rms angle_rms radial_velocity_rms is_cylindrical absolute_radial_velocity
0 476.396881 559.627075 0.3 0.999 0.000000 0.0 -9.0 0.0 0.0 0.0 0.0 0.000000
22 483.860718 561.203735 0.3 0.250 1.152332 0.0 0.0 0.0 0.0 0.0 0.0 1.152332
23 483.565857 566.687622 0.3 0.250 0.000000 0.0 5.0 0.0 0.0 0.0 0.0 0.000000
24 476.996277 576.006470 0.3 0.250 7.993592 0.0 11.5 0.0 0.0 0.0 0.0 7.993592
25 479.329987 567.058044 0.3 0.999 0.000000 0.0 -8.0 0.0 0.0 0.0 0.0 0.000000
In [11]:
len(X_train), len(X_val), len(X_test), len(ds)
Out[11]:
(33116, 41380, 24621, 99117)
In [48]:
visualize_ds(train_ds)

Catboost classifier

In [12]:
import catboost

def learn(X_train, X_val, y_train, y_val):
    pool = catboost.Pool(X_train, y_train)
    clf = catboost.CatBoostClassifier(
        custom_loss=['AUC', 'Accuracy'],
        n_estimators=1000,
        depth=6,
        learning_rate=1e-2,
        rsm=0.7,
        l2_leaf_reg=1.5,
    )
    clf.fit(
        pool,
        early_stopping_rounds=100,
        use_best_model=True, 
        eval_set=(X_val.values, y_val.values),
        plot=True,
        verbose=False,
    )
    return clf
In [13]:
cls = learn(X_train, X_val, y_train, y_val)
In [14]:
from sklearn.metrics import precision_recall_curve, precision_score, recall_score

import plotly
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import tqdm

def test_one(clf, X_test, y_test):
    y_test_hat = clf.predict_proba(X_test)
    pr, rec, thr = precision_recall_curve(y_test, y_test_hat[:, 1])
    ix = np.linspace(1, len(pr)-1, num=2000).astype(int)
    return pr[ix], rec[ix], thr[ix - 1]


# def heuristic_filter_scoring():
#     pr = []
#     rec = []
#     filter_range = list(range(1, 10))
#     for i in filter_range:
#         y_test_heuristic_hat = np.ones(len(heu_X_test))
#         y_test_heuristic_hat[filter_by_intensity(heu_X_test.intensity, i)] = 0
#         pr.append(precision_score(heu_y_test, y_test_heuristic_hat))
#         rec.append(recall_score(heu_y_test, y_test_heuristic_hat))
        
#     return pr, rec, filter_range

# pr_bl, rec_bl, thr_bl = heuristic_filter_scoring()

def plot_pr_rec(*models):
    traces = []
    for model, clf, X_test, y_test in models:
        pr, rec, thr = test_one(clf, X_test, y_test)
        pr_rec = go.Scattergl(x = rec, y = pr, mode='lines', text=thr, name=model)
        traces.append(pr_rec)

#     pr_rec_bl = go.Scatter(x = rec_bl, y = pr_bl, mode='lines+markers', text=thr_bl, name='Intensity BL')

    layout = go.Layout(
        title='Precission-recall',
        xaxis=dict(
            title='Recall'
        ),
        yaxis=dict(
            title='Precission'
        ))
    fig = go.Figure(
#         data=traces + [pr_rec_bl],
         data=traces,
        layout=layout)
    py.iplot(fig)
    
models = [('my classifier', cls, X_test, y_test)]
plot_pr_rec(*models)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [19]:
from sklearn.neighbors import KDTree

class ComputeFeatures(object):
    def __init__(self):
        self.r = [1.0]
        self.base_names = [
                'relative_radial_velocity',
                'relative_lateral_velocity',
                'cross_section',
                'distance_rms',
                'angle_rms',
                'radial_velocity_rms',
                'is_cylindrical',
                'absolute_radial_velocity']
        self.r_names = [
                'region_npoints',
                'avg_absolute_radial_velocity',
                'median_absolute_radial_velocity']

    def _feature_names(self):
        names = []
        names += self.base_names
        for r in self.r:
            names += list(map(lambda n: '%s_%s' % (n, r), self.r_names))
        return names
    
    def compute_point_features(self, point_id):
        point_features = []
        
        for n in self.base_names:
            f = self.seq_ds[n].values[point_id]
            point_features.append(f)
        
        for r in self.r:
            neighbours = self.get_point_neighbours(point_id, r)
            r_features = []
            for f in self.r_names:
                if f.statswith('avg_'):
                    f = f[4:]
                    vals = self.seq_ds[[f]].values[neighbours]
                    v = np.average(vals)
                    r_features.append(v)
                elif f.startswith('median_'):
                    f = f[7:]
                    vals = self.seq_ds[[f]].values[neighbours]
                    v = np.median(vals)
                    r_features.append(v)
                elif f == 'region_npoints':
                    r_features.append(len(neighbours))
                else:
                    raise 'Unknown feature'
            point_features += r_features
        return point_features

    def get_point_neighbours(self, point_id, r):
        return self.index.query_radius(self.xyz[point_id][np.newaxis, :], r=r)[0]
        
    def __call__(self, seq_ds):
        self.xyz = seq_ds[['x', 'y', 'z']].values[:]
        self.seq_ds = seq_ds
        
        self.index = KDTree(self.xyz)
        
        features = []
        for point_id in range(len(self.xyz)):
            features.append(self.compute_point_features(point_id))
        
        return pd.DataFrame(columns=self._feature_names(), data=features)
In [20]:
from tqdm import tqdm
import os

f_computer = ComputeFeatures()

features = []
for seq in tqdm(seqs):
    file_name = './features/' + str(seq) + '.csv'
    if not os.path.isfile(file_name):
        seq_ds = ds[ds['seq'] == seq]
        seq_f = f_computer(seq_ds)
        seq_f.to_csv(file_name)
        features.append(f_computer(seq_ds))
    else:
        features.append(pd.read_csv(file_name))
100%|██████████| 981/981 [00:02<00:00, 342.77it/s]
In [21]:
features[0].head()
Out[21]:
Unnamed: 0 relative_radial_velocity relative_lateral_velocity cross_section distance_rms angle_rms radial_velocity_rms is_cylindrical absolute_radial_velocity region_npoints_1.5 avg_absolute_radial_velocity_1.5 median_absolute_radial_velocity_1.5
0 0 0.000000 0.0 -9.0 0.0 0.0 0.0 0.0 0.000000 1 0.000000 0.000000
1 1 1.152332 0.0 0.0 0.0 0.0 0.0 0.0 1.152332 2 0.576166 0.576166
2 2 0.000000 0.0 5.0 0.0 0.0 0.0 0.0 0.000000 2 0.000000 0.000000
3 3 7.993592 0.0 11.5 0.0 0.0 0.0 0.0 7.993592 4 5.970561 7.944326
4 4 0.000000 0.0 -8.0 0.0 0.0 0.0 0.0 0.000000 5 1.006443 1.100242
In [22]:
X_train = features[:train_end]
X_train = pd.concat(X_train)

X_val = features[train_end:val_end]
X_val = pd.concat(X_val)

X_test = features[val_end:]
X_test = pd.concat(X_test)
In [23]:
import catboost

def learn(X_train, X_val, y_train, y_val):
    pool = catboost.Pool(X_train, y_train)
    clf = catboost.CatBoostClassifier(
        custom_loss=['AUC', 'Accuracy'],
        n_estimators=1000,
        depth=6,
        learning_rate=1e-2,
        rsm=0.1,
        l2_leaf_reg=2,
    )
    clf.fit(
        pool,
        early_stopping_rounds=100,
        use_best_model=True, 
        eval_set=(X_val.values, y_val.values),
        plot=True,
        verbose=False,
    )
    return clf
In [24]:
cls = learn(X_train, X_val, y_train, y_val)
In [30]:
models = [('my classifier', cls, X_test, y_test)]
plot_pr_rec(*models)

Well, a bit better result that when using heuristics, though not that much

In [32]:
THR = 0.273

y_hat = cls.predict_proba(X_test)[:, 1]
y_hat[y_hat > THR] = 1
y_hat[y_hat <= THR] = 0

print('precission', precission(y_hat, y_test))
print('recall', recall(y_hat, y_test))
precission 0.3096671949286846
recall 0.29615034859048195
In [ ]: